## load gapminder
suppressPackageStartupMessages(library(gapminder))
## load tidyverse
suppressPackageStartupMessages(library(tidyverse))
## load forcats
suppressPackageStartupMessages(library(forcats))
## load plotly
suppressPackageStartupMessages(library(plotly))
Before dropping Oceania, let’s check the number of rows in gapminder, and the levels of continent.
# Tabulate the size of the untouched data set: total number of rows and the
# number of factor levels in `continent`.
knitr::kable(
  summarize(
    gapminder,
    nrow = nrow(gapminder),
    nlevels = nlevels(gapminder$continent)
  ),
  col.names = c("Number of rows", "Levels of `continent`")
)
| Number of rows | Levels of continent |
|---|---|
| 1704 | 5 |
Let’s first try to drop Oceania, and show the number of rows, and the levels of continent.
# Keep every observation whose continent is not Oceania. Only rows are
# removed here; the factor levels of `continent` are not touched.
drop <- filter(gapminder, continent != "Oceania")

# Report the row count and the number of `continent` levels after filtering.
knitr::kable(
  summarize(
    drop,
    nrow = nrow(drop),
    nlevels = nlevels(drop$continent)
  ),
  col.names = c(
    "Number of rows after dropping",
    "Levels of `continent` after dropping"
  )
)
| Number of rows after dropping | Levels of continent after dropping |
|---|---|
| 1680 | 5 |
Here we can see that, even though we removed all rows associated with the continent of Oceania (so the number of rows decreases), the number of levels of continent is unchanged, which is the wrong value. To fix it, we need to call droplevels().
# Discard factor levels that no longer occur in the filtered data.
drop_levels <- droplevels(drop)

# Report the row count and the number of `continent` levels once the unused
# levels are gone.
knitr::kable(
  summarize(
    drop_levels,
    nrow = nrow(drop_levels),
    nlevels = nlevels(drop_levels$continent)
  ),
  col.names = c(
    "Number of rows after dropping levels",
    "Levels of `continent` after dropping levels"
  )
)
| Number of rows after dropping levels | Levels of continent after dropping levels |
|---|---|
| 1680 | 4 |
To summarize, we address the number of rows and the levels of continent before and after removing Oceania.
# Build a 2 x 2 matrix (not a tibble) comparing the data before and after
# removing Oceania. matrix() fills column by column, so the first two values
# form the "before" column and the last two the "after" column.
summary <- matrix(
  c(
    nrow(gapminder),
    nlevels(gapminder$continent),
    nrow(drop_levels),
    nlevels(drop_levels$continent)
  ),
  nrow = 2,
  dimnames = list(
    c("Number of rows", "Levels"),
    c("before", "after")
  )
)
# display the table
knitr::kable(summary)
| before | after | |
|---|---|---|
| Number of rows | 1704 | 1680 |
| Levels | 5 | 4 |
country and continent
Let’s show a preview of our dropped version of gapminder before reordering.
# Preview the first rows of the Oceania-free data.
knitr::kable(head(drop_levels))
| country | continent | year | lifeExp | pop | gdpPercap |
|---|---|---|---|---|---|
| Afghanistan | Asia | 1952 | 28.801 | 8425333 | 779.4453 |
| Afghanistan | Asia | 1957 | 30.332 | 9240934 | 820.8530 |
| Afghanistan | Asia | 1962 | 31.997 | 10267083 | 853.1007 |
| Afghanistan | Asia | 1967 | 34.020 | 11537966 | 836.1971 |
| Afghanistan | Asia | 1972 | 36.088 | 13079460 | 739.9811 |
| Afghanistan | Asia | 1977 | 38.438 | 14880372 | 786.1134 |
# Preview the last rows of the Oceania-free data.
knitr::kable(tail(drop_levels))
| country | continent | year | lifeExp | pop | gdpPercap |
|---|---|---|---|---|---|
| Zimbabwe | Africa | 1982 | 60.363 | 7636524 | 788.8550 |
| Zimbabwe | Africa | 1987 | 62.351 | 9216418 | 706.1573 |
| Zimbabwe | Africa | 1992 | 60.377 | 10704340 | 693.4208 |
| Zimbabwe | Africa | 1997 | 46.809 | 11404948 | 792.4500 |
| Zimbabwe | Africa | 2002 | 39.989 | 11926563 | 672.0386 |
| Zimbabwe | Africa | 2007 | 43.487 | 12311143 | 469.7093 |
Looking at the bigger picture, this data frame is ordered alphabetically by country. Let’s try to reorder the levels of country by the maximum population over the years using fct_reorder(). Note that the levels are reordered in ascending order, so the first level is the country with the smallest maximum population over the years.
# Reorder the levels of `country` by each country's maximum `pop`, then show
# the first few levels of the reordered factor.
knitr::kable(
  head(
    levels(
      fct_reorder(drop_levels$country, drop_levels$pop, .fun = max)
    )
  ),
  col.names = c("`country` after reordering")
)
country after reordering |
|---|
| Sao Tome and Principe |
| Iceland |
| Djibouti |
| Equatorial Guinea |
| Bahrain |
| Comoros |
In order to check our results, we manually calculate the maximum population and order it.
# Cross-check the factor reordering by computing the same statistic by hand.
group_by(drop_levels, country) %>%
  # maximum population of each country
  summarize(max_pop = max(pop)) %>%
  # smallest maximum first
  arrange(max_pop) %>%
  # keep only the first few rows for display
  head() %>%
  knitr::kable(
    col.names = c("country", "Maximum population over the years")
  )
| country | Maximum population over the years |
|---|---|
| Sao Tome and Principe | 199579 |
| Iceland | 301931 |
| Djibouti | 496374 |
| Equatorial Guinea | 551201 |
| Bahrain | 708573 |
| Comoros | 710960 |
So our operation using fct_reorder is correct.
We can do the same thing on continent. For example, we reorder its levels by the average life expectancy.
# Reorder the levels of `continent` by mean `lifeExp`, then show the
# resulting levels.
knitr::kable(
  head(
    levels(
      fct_reorder(drop_levels$continent, drop_levels$lifeExp, .fun = mean)
    )
  ),
  col.names = c("`continent` after reordering")
)
continent after reordering |
|---|
| Africa |
| Asia |
| Americas |
| Europe |
We also double-check the results.
# Cross-check the continent ordering by computing the means by hand.
group_by(drop_levels, continent) %>%
  # mean life expectancy of each continent
  summarize(mean_lifeExp = mean(lifeExp)) %>%
  # lowest mean first
  arrange(mean_lifeExp) %>%
  # keep only the first few rows for display
  head() %>%
  knitr::kable(
    col.names = c("continent", "Average life expectancy over the years")
  )
| continent | Average life expectancy over the years |
|---|---|
| Africa | 48.86533 |
| Asia | 60.06490 |
| Americas | 64.65874 |
| Europe | 71.90369 |
In the previous section, we try to create two examples using fct_reorder() and arrange(), with identical results. Now, we try to reuse the first example and see if these two functions affect figures generated.
Let’s try the first example, using fct_reorder() only.
# Scatterplot of maximum population per country in the Americas, with the
# y axis ordered through fct_reorder() only (no arrange()).
drop_levels %>%
  # restrict to one continent
  filter(continent == "Americas") %>%
  # attach each country's maximum population to all of its rows
  group_by(country) %>%
  mutate(max_pop = max(pop)) %>%
  # the y axis uses `country` with levels reordered by maximum population
  ggplot(
    aes(
      x = max_pop,
      y = fct_reorder(country, pop, .fun = max),
      color = country
    )
  ) +
  geom_point() +
  # log10 scale on the x axis
  scale_x_log10() +
  labs(x = "Maximum population", y = "country")
Let’s do it with arrange() only.
# Same scatterplot, but relying on arrange() only: the rows are sorted while
# `country` keeps its existing level order on the y axis.
drop_levels %>%
  # restrict to one continent
  filter(continent == "Americas") %>%
  # one row per country holding its maximum population
  group_by(country) %>%
  summarize(max_pop = max(pop)) %>%
  # sort the rows by max_pop
  arrange(max_pop) %>%
  ggplot(aes(x = max_pop, y = country, color = country)) +
  geom_point() +
  # log10 scale on the x axis
  scale_x_log10() +
  labs(x = "Maximum population", y = "country")
Now we use fct_reorder() with arrange().
# Same scatterplot again, this time combining arrange() on the rows with
# fct_reorder() on the y axis.
drop_levels %>%
  # restrict to one continent
  filter(continent == "Americas") %>%
  # attach each country's maximum population to all of its rows
  group_by(country) %>%
  mutate(max_pop = max(pop)) %>%
  # sort the rows by max_pop
  arrange(max_pop) %>%
  # the y axis uses `country` with levels reordered by maximum population
  ggplot(
    aes(
      x = max_pop,
      y = fct_reorder(country, pop, .fun = max),
      color = country
    )
  ) +
  geom_point() +
  # log10 scale on the x axis
  scale_x_log10() +
  labs(x = "Maximum population", y = "country")
Finding(s): using fct_reorder() (with or without arrange()) changes the order of the levels shown in figures, while using arrange() alone does not affect the order of the levels.
In order to test whether mutated data can survive the round trip of writing to and then reading back from a file, we first use the first example in Part 1 to mutate gapminder.
# Build the table used for the file round-trip experiments: reorder the
# levels of `country` by maximum population, then keep one row per country
# holding that maximum.
data <- gapminder %>%
  mutate(country = fct_reorder(country, pop, .fun = max)) %>%
  group_by(country) %>%
  summarize(max_pop = max(pop))

# Column types and a peek at the values.
glimpse(data)
## Observations: 142
## Variables: 2
## $ country <fct> Sao Tome and Principe, Iceland, Djibouti, Equatorial G...
## $ max_pop <dbl> 199579, 301931, 496374, 551201, 708573, 710960, 720230...
# First rows of the summarized table.
knitr::kable(head(data))
| country | max_pop |
|---|---|
| Sao Tome and Principe | 199579 |
| Iceland | 301931 |
| Djibouti | 496374 |
| Equatorial Guinea | 551201 |
| Bahrain | 708573 |
| Comoros | 710960 |
# Last rows of the summarized table.
knitr::kable(tail(data))
| country | max_pop |
|---|---|
| Pakistan | 169270617 |
| Brazil | 190010647 |
| Indonesia | 223547000 |
| United States | 301139947 |
| India | 1110396331 |
| China | 1318683096 |
write_csv()/read_csv()
# write to csv
# Write the summarized tibble to a CSV file in the working directory.
write_csv(data, "data_csv.csv")
# Read it back. Column types are re-inferred from the text (see the parsing
# message printed below), so `country` does not come back as a factor.
data_csv <- read_csv("data_csv.csv")
## Parsed with column specification:
## cols(
## country = col_character(),
## max_pop = col_double()
## )
# Column types and a peek at the values after the CSV round trip.
glimpse(data_csv)
## Observations: 142
## Variables: 2
## $ country <chr> "Sao Tome and Principe", "Iceland", "Djibouti", "Equat...
## $ max_pop <dbl> 199579, 301931, 496374, 551201, 708573, 710960, 720230...
# First rows after the CSV round trip.
knitr::kable(head(data_csv))
| country | max_pop |
|---|---|
| Sao Tome and Principe | 199579 |
| Iceland | 301931 |
| Djibouti | 496374 |
| Equatorial Guinea | 551201 |
| Bahrain | 708573 |
| Comoros | 710960 |
# Last rows after the CSV round trip.
knitr::kable(tail(data_csv))
| country | max_pop |
|---|---|
| Pakistan | 169270617 |
| Brazil | 190010647 |
| Indonesia | 223547000 |
| United States | 301139947 |
| India | 1110396331 |
| China | 1318683096 |
Finding(s): with the CSV format, R has to parse each column again using default column types when reading the file. Therefore, the class of country changes from <fct> to <chr>. However, the data values remain unchanged.
saveRDS()/readRDS()
# save to rds
# Serialize the tibble to an .rds file in the working directory ...
saveRDS(object = data, file = "data_rds.rds")
# ... and restore it into a new object.
data_rds <- readRDS(file = "data_rds.rds")
# Column types and a peek at the values after the RDS round trip.
glimpse(data_rds)
## Observations: 142
## Variables: 2
## $ country <fct> Sao Tome and Principe, Iceland, Djibouti, Equatorial G...
## $ max_pop <dbl> 199579, 301931, 496374, 551201, 708573, 710960, 720230...
# First rows after the RDS round trip.
knitr::kable(head(data_rds))
| country | max_pop |
|---|---|
| Sao Tome and Principe | 199579 |
| Iceland | 301931 |
| Djibouti | 496374 |
| Equatorial Guinea | 551201 |
| Bahrain | 708573 |
| Comoros | 710960 |
# Last rows after the RDS round trip.
knitr::kable(tail(data_rds))
| country | max_pop |
|---|---|
| Pakistan | 169270617 |
| Brazil | 190010647 |
| Indonesia | 223547000 |
| United States | 301139947 |
| India | 1110396331 |
| China | 1318683096 |
Finding(s): saveRDS() and readRDS() can keep both data and classes of each column.
dput()/dget()
# put to file
# Write a text representation of the tibble to a file ...
dput(x = data, file = "data.txt")
# ... and rebuild the object from that file.
data_get <- dget(file = "data.txt")
# Column types and a peek at the values after the dput()/dget() round trip.
glimpse(data_get)
## Observations: 142
## Variables: 2
## $ country <fct> Sao Tome and Principe, Iceland, Djibouti, Equatorial G...
## $ max_pop <dbl> 199579, 301931, 496374, 551201, 708573, 710960, 720230...
# First rows after the dput()/dget() round trip.
knitr::kable(head(data_get))
| country | max_pop |
|---|---|
| Sao Tome and Principe | 199579 |
| Iceland | 301931 |
| Djibouti | 496374 |
| Equatorial Guinea | 551201 |
| Bahrain | 708573 |
| Comoros | 710960 |
# Last rows after the dput()/dget() round trip.
knitr::kable(tail(data_get))
| country | max_pop |
|---|---|
| Pakistan | 169270617 |
| Brazil | 190010647 |
| Indonesia | 223547000 |
| United States | 301139947 |
| India | 1110396331 |
| China | 1318683096 |
Finding(s): dput() and dget() can also keep both data and classes of each column.
Here I want to remake a figure I used in the first assignment. It tries to show the trends of population over the years for each continent.
# Original figure: population against year, one semi-transparent point per
# observation, colored and faceted by continent.
p_ori <- ggplot(gapminder, aes(x = year, y = pop, color = continent)) +
  # light points so overlapping observations stay visible
  geom_point(alpha = 0.1) +
  # log10 scale on the y axis
  scale_y_log10() +
  # one panel per continent
  facet_wrap(~ continent)
p_ori
We cannot get much information from this figure. Therefore, we tidy up the data and improve the presentation (e.g., color, size) to make it more meaningful.
# Summarize population per year and continent, then reshape to long form so
# that each summary statistic becomes its own series in the plot.
tidy_up <- gapminder %>%
  group_by(year, continent) %>%
  # minimum, mean and maximum population within each year/continent cell
  summarize(
    min_pop = min(pop),
    mean_pop = mean(pop),
    max_pop = max(pop)
  ) %>%
  # stack the three statistics: the statistic's name goes to `pop_attribute`,
  # its value to `pop_val`
  # NOTE(review): tidyr documents gather() as superseded by pivot_longer();
  # kept here because the installed tidyr version is unknown.
  gather(
    min_pop, mean_pop, max_pop,
    key = "pop_attribute",
    value = "pop_val"
  )
# Reworked figure: one line (with points) per summary statistic, faceted by
# continent.
p_new <- tidy_up %>%
  ggplot(
    aes(
      x = year,
      y = pop_val,
      color = pop_attribute,
      group = pop_attribute
    )
  ) +
  # points joined by lines
  geom_point() +
  geom_line() +
  # log10 scale on the y axis
  scale_y_log10() +
  # one panel per continent
  facet_wrap(~ continent) +
  # axis and legend titles
  labs(y = "Population distribution") +
  scale_color_discrete("Population distribution") +
  # fewer, evenly spaced year ticks
  scale_x_continuous(breaks = scales::pretty_breaks(n = 4))
p_new
Difference(s): the color in the new figure is not for each continent, but for different distributions of each continent, which makes more sense. In addition, we try to tidy up data into different distribution attributes, so the trends over the years are clearer.
plotly is an R package for creating interactive web-based graphs. In the following simple example, the differences of using plotly are:
# Turn the static ggplot into an interactive plotly widget.
ggplotly(p = p_new)
ggsave()
Let’s play around with the arguments of ggsave(). For example, width, height, resolution, and text scale.
# Every save below writes into ./ggsave, and ggsave() cannot be relied on to
# create a missing directory, so make sure the folder exists first.
# showWarnings = FALSE keeps this quiet when it is already there.
dir.create("./ggsave", showWarnings = FALSE)
# default width and height; no `plot` is given, so the last plot is saved
ggsave("./ggsave/1.png")
## Saving 7 x 5 in image
1.png
# Explicit width and height instead of the defaults.
ggsave(filename = "./ggsave/2.png", width = 10, height = 5)
2.png
Here we try to generate image of two resolutions, one as normal and one very low.
# Normal resolution (300 dpi).
ggsave(filename = "./ggsave/3.png", dpi = 300)
## Saving 7 x 5 in image
3.png
# Deliberately very low resolution (10 dpi) for comparison.
ggsave(filename = "./ggsave/4.png", dpi = 10)
## Saving 7 x 5 in image
4.png
# Vector output: save as EPS.
ggsave(filename = "./ggsave/5.eps", device = "eps")
## Saving 7 x 5 in image
(An EPS file cannot be loaded into an HTML file, so it is not shown here.)
# Raster output: save as BMP.
ggsave(filename = "./ggsave/6.bmp", device = "bmp")
## Saving 7 x 5 in image
6.bmp
The difference is that vector-format images can still be modified afterwards, while this is not possible with raster-format images.
p
Without explicit provision of the plot object p, ggsave() always tries to save the latest figure (in our case, it is p_new). Therefore, we need to provide the plot object if we want to save other figures, for example, p_ori.
# Pass the plot object explicitly so that p_ori is saved rather than the
# most recent plot.
ggsave(filename = "./ggsave/7.png", plot = p_ori)
## Saving 7 x 5 in image
7.png
Make a deeper exploration of the forcats packages.
According to this reference, we try to explore the following functions of forcats packages:
- fct_relevel();
- fct_reorder2();
- fct_infreq() and fct_rev();
- fct_recode();
- fct_collapse(); and
- fct_lump().

fct_relevel()
We can use fct_relevel() to manually reorder levels. For example, we can move Canada to be the first level in this example (which in the figure is the bottom one).
# Scatterplot of maximum population per country in the Americas, with Canada
# moved to the front of the `country` levels via fct_relevel().
drop_levels %>%
  # restrict to one continent
  filter(continent == "Americas") %>%
  # one row per country holding its maximum population
  group_by(country) %>%
  summarize(max_pop = max(pop)) %>%
  # sort the rows by max_pop
  arrange(max_pop) %>%
  ggplot(
    aes(
      x = max_pop,
      y = fct_relevel(country, "Canada"),
      color = country
    )
  ) +
  geom_point() +
  # log10 scale on the x axis
  scale_x_log10() +
  # axis titles
  xlab("Maximum population") +
  ylab("country") +
  # Highlight Canada: a named label vector relabels only the matching break.
  # The stray `parse = TRUE` that used to sit inside c() was not an argument
  # of the scale; it only added a bogus label element named "parse", so it
  # has been removed.
  scale_y_discrete(labels = c("Canada" = expression(bold(Canada))))
fct_reorder2()
We can use fct_reorder2() to reorder a factor using a summary function of two other variables. In the following example, the levels of country are ordered by applying min to population and GDP per capita together (though the effect is not obvious).
# GDP per capita against population for the countries of Oceania. The color
# legend uses `country` reordered by fct_reorder2(), with `min` applied to
# each country's `pop` and `gdpPercap` values.
gapminder %>%
  # Oceania only
  filter(continent == "Oceania") %>%
  ggplot(
    aes(
      x = pop,
      y = gdpPercap,
      color = fct_reorder2(country, pop, gdpPercap, .fun = min)
    )
  ) +
  # one line per country
  geom_line() +
  # legend title
  labs(color = "Country") +
  # log10 scale on the x axis
  scale_x_log10()
fct_infreq() and fct_rev()
We use fct_infreq() to order levels by decreasing frequency (most frequent first) and fct_rev() to reverse that order. They are usually used in bar plots.
# Bar plot counting, per country in the Americas, the observations with a
# GDP per capita of at least 15,000.
gapminder %>%
  # one continent, and only rows with gdpPercap >= 15000
  filter(continent == "Americas", gdpPercap >= 15000) %>%
  # order levels by frequency, then reverse that order
  mutate(country = fct_rev(fct_infreq(country))) %>%
  # one bar per country; bar height is the row count
  ggplot(aes(x = country, fill = country)) +
  geom_bar()
fct_recode()
fct_recode() can be used to manually change the names of levels. For example, we change the name of “United States” in the above example.
# Same bar plot as before, with the level "United States" renamed to "USA".
gapminder %>%
  # one continent, and only rows with gdpPercap >= 15000
  filter(continent == "Americas", gdpPercap >= 15000) %>%
  mutate(
    # order levels by frequency, reverse that order, then rename one level
    country = fct_recode(
      fct_rev(fct_infreq(country)),
      "USA" = "United States"
    )
  ) %>%
  # one bar per country; bar height is the row count
  ggplot(aes(x = country, fill = country)) +
  geom_bar()
fct_collapse()
We can use fct_collapse() to manually combine levels into groups. For example, we combine “Canada” and “USA” in the above example.
# Same bar plot, with "Canada" and "USA" merged into a single level
# "North Americas".
gapminder %>%
  # one continent, and only rows with gdpPercap >= 15000
  filter(continent == "Americas", gdpPercap >= 15000) %>%
  mutate(
    # order levels by frequency, reverse, rename "United States", then merge
    # the two levels into one group
    country = fct_collapse(
      fct_recode(
        fct_rev(fct_infreq(country)),
        "USA" = "United States"
      ),
      "North Americas" = c("Canada", "USA")
    )
  ) %>%
  # one bar per level; bar height is the row count
  ggplot(aes(x = country, fill = country)) +
  geom_bar()
fct_lump()
fct_lump() tries to automatically aggregate groups.
# Same bar plot, letting fct_lump() (called with its defaults) decide which
# levels to lump together.
gapminder %>%
  # one continent, and only rows with gdpPercap >= 15000
  filter(continent == "Americas", gdpPercap >= 15000) %>%
  # order levels by frequency, reverse that order, then lump automatically
  mutate(country = fct_lump(fct_rev(fct_infreq(country)))) %>%
  # one bar per level; bar height is the row count
  ggplot(aes(x = country, fill = country)) +
  geom_bar()